capture log close
clear all
set more off

* Stablish Working Directory ***************************************************
cd "$workdirectory"

*-------------------------------------------------------------------------------
* Canada - For Canada we count classwk shares by metro area (METROCA) and then match it directly to the FUA dataset

* Importing Files

import delimited raw_datasets\Maps\classwk_individual_countries\canada\canada_1981_classwk.csv, varnames(11) rowrange(12) clear
gen country_year = "canada_1981"
save "processed_datasets\maps_and_cities_canada_classwk", replace

import delimited raw_datasets\Maps\classwk_individual_countries\canada\canada_1991_classwk.csv, varnames(11) rowrange(12) clear
gen country_year = "canada_1991"
append using "processed_datasets\maps_and_cities_canada_classwk"
save "processed_datasets\maps_and_cities_canada_classwk", replace

import delimited raw_datasets\Maps\classwk_individual_countries\canada\canada_2001_classwk.csv, varnames(11) rowrange(12) clear
gen country_year = "canada_2001"
append using "processed_datasets\maps_and_cities_canada_classwk"
save "processed_datasets\maps_and_cities_canada_classwk", replace

* Cleaning
keep if v1 == "Weighted N"
drop v1
drop if v2 == "0 Not in a metropolitan area"
drop if v2 == "COL TOTAL"

* Genearting efua_id variable
* Notes: Windsor is identified as "Detroit" in the FUA dataset
* Sudbury and Thunder Bay; Regina and Saskatoon; Sherbrooke and Trois-RiviÃ¨res; are dropped as they are merged in the classwk dataset but are not a single FUA
* Oshawa and Hamilton belong to Toronto in the FUA dataset

gen efua_id = 0

replace efua_id = 3865 if v2 == "1 Halifax"
replace efua_id = 2360 if v2 == "10 Kitchener"
replace efua_id = 2220 if v2 == "11 London"
replace efua_id = 1996 if v2 == "12 Windsor"
replace efua_id = 1837 if v2 == "14 Winnipeg"
replace efua_id = 1276 if v2 == "16 Calgary"
replace efua_id = 1572 if v2 == "17 Edmonton"
replace efua_id = 375  if v2 == "18 Vancouver"
replace efua_id = 28   if v2 == "19 Victoria"
replace efua_id = 3789 if v2 == "2 QuÃ©bec"
replace efua_id = 3485 if v2 == "3 MontrÃ©al"
replace efua_id = 3343 if v2 == "5 Ottawa-Hull"
replace efua_id = 2626 if v2 == "6 Oshawa"
replace efua_id = 2626 if v2 == "7 Toronto"
replace efua_id = 2626 if v2 == "8 Hamilton"
replace efua_id = 2751 if v2 == "9 St. Catharines-Niagara"

drop if v2 == "13 Sudbury and Thunder Bay"
drop if v2 == "15 Regina and Saskatoon"
drop if v2 == "4 Sherbrooke and Trois-RiviÃ¨res"

drop v2

* We now collapse to solve the issue of multiple cities belonging to Toronto in the FUA dataset
collapse (sum) niunotinuniverse selfemployed wagesalaryworker rowtotal unpaidworker, by(efua_id country_year)

save "processed_datasets\maps_and_cities_canada_classwk", replace

* Adding FUA population
import delimited raw_datasets\Maps\intersection.csv, clear
keep if cntry_na_1 == "Canada"
collapse (mean) fua_p_2015 (first) efua_name, by(efua_id)

tempfile can
save `can'

use "processed_datasets\maps_and_cities_canada_classwk", clear
merge m:1 efua_id using `can'
keep if _merge == 3
drop _merge

* Formating to fit main dataset
foreach v of varlist niunotinuniverse selfemployed wagesalaryworker rowtotal unpaidworker{
	gen _`v' = `v'
}

drop niunotinuniverse selfemployed wagesalaryworker rowtotal unpaidworker

* Calculating shares
foreach v2 of varlist _selfemployed _wagesalaryworker _unpaidworker{
	gen share`v2' = `v2'/(_rowtotal-_niunotinuniverse)
}

* We generate variables for classwk that are not present in the census
gen _other 					= 0
gen _unknownmissing 		= 0
gen share_other 			= 0
gen share_unknownmissing 	= 0

save "processed_datasets\maps_and_cities_canada_classwk", replace

*-------------------------------------------------------------------------------
* Philippines (2000) - For Philippines we count industry shares by city (CITYPH) and then match it directly to the FUA dataset

import delimited raw_datasets\Maps\classwk_individual_countries\philippines\philippines_2000_classwk.csv, varnames(11) rowrange(12) clear
gen country_year = "philippines_2000"
save "processed_datasets\maps_and_cities_philippines_classwk", replace

* Cleaning
keep if v1 == "Weighted N"
drop v1
drop if v2 == "99 Not in an identified city"
drop if v2 == "COL TOTAL"

* Genearting efua_id variable
* Notes: 
* Mandaue belongs to Cebu's FUA
* Mandaluyong; Marikina; Pasig; Valenzuela; Kalookan; Las PiÃ±as; Makati; Muntinlupa ; ParaÃ±aque; Pasay belong to Manila's FUA

gen efua_id = 0

replace efua_id = 4852 if v2 == "1 Butuan"
replace efua_id = 999  if v2 == "2 Baguio"
replace efua_id = 4372 if v2 == "3 Cebu"
replace efua_id = 4372 if v2 == "4 Mandaue"
replace efua_id = 4985 if v2 == "5 Davao"
replace efua_id = 3888 if v2 == "6 Iloilo"
replace efua_id = 4634 if v2 == "7 Iligan"
replace efua_id = 4750 if v2 == "8 Cagayan De Oro"
replace efua_id = 4101 if v2 == "9 Bacolod"
replace efua_id = 1797 if v2 == "10 Angeles"
replace efua_id = 3687 if v2 == "11 Lucena"
replace efua_id = 4910 if v2 == "12 General Santos"
replace efua_id = 1528 if v2 == "13 Olongapo"
replace efua_id = 4067 if v2 == "14 Zamboanga"
replace efua_id = 2845 if v2 == "15 Mandaluyong"
replace efua_id = 2845 if v2 == "16 Marikina"
replace efua_id = 2845 if v2 == "17 Pasig"
replace efua_id = 2845 if v2 == "18 Quezon"
replace efua_id = 2845 if v2 == "19 Valenzuela"
replace efua_id = 2845 if v2 == "20 Kalookan"
replace efua_id = 2845 if v2 == "21 Las PiÃ±as"
replace efua_id = 2845 if v2 == "22 Makati"
replace efua_id = 2845 if v2 == "23 Muntinlupa"
replace efua_id = 2845 if v2 == "24 ParaÃ±aque"
replace efua_id = 2845 if v2 == "25 Pasay"
replace efua_id = 4722 if v2 == "26 Cotabato"
replace efua_id = 4693 if v2 == "27 Marawi"
replace efua_id = 2845 if v2 == "28 Manila"

drop v2

* We now collapse to solve the issue of multiple cities belonging to a single FUA
collapse (sum) niunotinuniverse selfemployed wagesalaryworker unpaidworker unknownmissing rowtotal, by(efua_id country_year)

save "processed_datasets\maps_and_cities_philippines_classwk", replace

* Adding FUA population

import delimited raw_datasets\Maps\intersection.csv, clear
keep if cntry_na_1 == "Philippines"
collapse (mean) fua_p_2015 (first) efua_name, by(efua_id)

tempfile plp
save `plp'

use "processed_datasets\maps_and_cities_philippines_classwk", clear
merge m:1 efua_id using `plp'
keep if _merge == 3
drop _merge

* Formating to fit main dataset
foreach v of varlist niunotinuniverse selfemployed wagesalaryworker unpaidworker unknownmissing rowtotal{
	gen _`v' = `v'
}

drop niunotinuniverse selfemployed wagesalaryworker unpaidworker unknownmissing rowtotal

* Calculating shares
foreach v2 of varlist _selfemployed _wagesalaryworker _unpaidworker _unknownmissing{
	gen share`v2' = `v2'/(_rowtotal-_niunotinuniverse)
}

* We generate variables for classwk that are not present in the census
gen _other 					= 0
gen share_other 			= 0

save "processed_datasets\maps_and_cities_philippines_classwk", replace


*-------------------------------------------------------------------------------
* United States

*-------------------------------------------------------------------------------
* Creating shares of subdistricts located within a FUA

* Importing intersection of FUAs and districts
import delimited raw_datasets\Maps\intersection_usa.csv, encoding(UTF-8) clear
save "processed_datasets\maps_and_cities_usa_classwk", replace

* Importing area of districts
* We drop observations without geolevel2

import delimited raw_datasets\Maps\area_subdist_usa.csv, encoding(UTF-8) clear
keep geolevel2 area_subd
drop if geolevel2 == .
tempfile area_usa
save `area_usa', replace

* Merging and obtaining shares
use "processed_datasets\maps_and_cities_usa_classwk", clear
merge m:1 geolevel2 using `area_usa'
keep if _merge == 3
drop _merge

gen share = area_calcu/area_subd
sort geolevel2
save "processed_datasets\maps_and_cities_usa_classwk", replace

********************************************************************************
* Generating industry per city data

* Generating empty file to attach data to
clear all
tempfile data_classwk_cities_usa
save `data_classwk_cities_usa.dta', empty replace

* Genearting local with all .csv in the folder
local files : dir "raw_datasets\Maps\classwk_individual_countries\usa" files "*.csv"

cd raw_datasets\Maps\classwk_individual_countries\usa

* Importing, appending and saving
foreach fi in `files' {
  import delimited `fi', varnames(12) rowrange(13) clear
  gen country_year = subinstr("`fi'","_classwk.csv","",.)
  append using `data_classwk_cities_usa'
  save `data_classwk_cities_usa', replace
}

* Cleaning
keep if v1 == "Weighted N"
drop v1
rename v2 census
drop if census == "COL TOTAL"

* Exctracting geolevel2
gen geolevel2 = regexs(0) if(regexm(census, "[0-9]+"))
destring geolevel2, replace

sort geolevel2
save `data_classwk_cities_usa', replace

********************************************************************************

cd ..\..\..\..

********************************************************************************
* Genearting final dataset

* Joining both datasets
use "processed_datasets\maps_and_cities_usa_classwk", clear
joinby geolevel2 using `data_classwk_cities_usa'

* Applying Shares
foreach v of varlist niunotinuniverse selfemployed wagesalaryworker unpaidworker rowtotal{
	gen _`v' = `v'*share
}

* Fixing country names from both datasets
replace cntry_na_1 = "United States" if cntry_na_1 == "UnitedStates"

* Drop intersections which associate districts from country i to a city located in country j
drop if cntry_name != cntry_na_1

* Collapsing by city and census
collapse (sum) _* (mean) fua_p_2015 (first) efua_name ,by(efua_id country_year)

foreach v2 of varlist _selfemployed _wagesalaryworker _unpaidworker{
	gen share`v2' = `v2'/(_rowtotal-_niunotinuniverse)
}

* We generate variables for classwk that are not present in the US census
gen _other 					= 0
gen _unknownmissing 		= 0
gen share_other 			= 0
gen share_unknownmissing 	= 0

save "processed_datasets\maps_and_cities_usa_classwk", replace

*-------------------------------------------------------------------------------
* Italy - For Italy we count industry shares by city and then match it directly to the FUA dataset

use raw_datasets\Maps\classwk_individual_countries\italy\italy_classwk, clear

* We add the code for each different year into a single variable
egen city = rowtotal(it2014h_gracom it2015h_gracom it2016h_gracom it2017h_gracom it2018h_gracom it2019h_gracom)

* Keeping only the variables we need
keep year perwt classwk city

* Dropping non-cities
drop if city == 99999

* Genearting efua_id variable
gen efua_id = 0
replace efua_id = 85 	if city == 1272
replace efua_id = 1309 	if city == 10025
replace efua_id = 1600 	if city == 15146
replace efua_id = 2772 	if city == 23091
replace efua_id = 3592 	if city == 27042
replace efua_id = 3157 	if city == 37006
replace efua_id = 3106 	if city == 48017
replace efua_id = 3881	if city == 58091
replace efua_id = 4333 	if city == 63049
replace efua_id = 5004	if city == 72006
replace efua_id = 4232 	if city == 82053
replace efua_id = 4771 	if city == 83048
replace efua_id = 4658 	if city == 87015
drop city

* Genearting country_year variable
gen country_year = ""
foreach y of numlist 2014/2019{
    replace country_year = "italy_`y'" if year == `y'
}
drop year

* Summing perwt at year, indgen and efua_id level
collapse (sum) perwt, by(country_year classwk efua_id)

* Reshaping to fit main dataset format
sort country_year efua_id
reshape wide perwt, i(country_year efua_id) j(classwk)

* Renaming to match with main dataset
rename perwt0		niunotinuniverse
rename perwt1		selfemployed
rename perwt2		wagesalaryworker
rename perwt3		unpaidworker
*rename perwt4		other
*rename perwt9		unknownmissing

* Generating Row Total variables
egen rowtotal = rowtotal(niunotinuniverse selfemployed wagesalaryworker unpaidworker)

* Formating to fit main dataset
foreach v of varlist niunotinuniverse selfemployed wagesalaryworker unpaidworker rowtotal{
	gen _`v' = `v'
}

drop niunotinuniverse selfemployed wagesalaryworker unpaidworker rowtotal

* Replacing with 0 when missing
foreach v3 of varlist _niunotinuniverse _selfemployed _wagesalaryworker _unpaidworker _rowtotal{
	replace `v3' = 0 if `v3' == .
}

* Calculating shares
foreach v2 of varlist _selfemployed _wagesalaryworker _unpaidworker{
	gen share`v2' = `v2'/(_rowtotal-_niunotinuniverse)
}

save "processed_datasets\maps_and_cities_italy_classwk", replace

* Adding FUA population

import delimited raw_datasets\Maps\intersection.csv, clear
keep if cntry_na_1 == "Italy"
collapse (mean) fua_p_2015 (first) efua_name, by(efua_id)

tempfile ita
save `ita'

use "processed_datasets\maps_and_cities_italy_classwk", clear
merge m:1 efua_id using `ita'
keep if _merge == 3
drop _merge

* We generate variables for classwk that are not present in the census
gen _other 					= 0
gen _unknownmissing 		= 0
gen share_other 			= 0
gen share_unknownmissing 	= 0

save "processed_datasets\maps_and_cities_italy_classwk", replace
